{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
    "# Use a context manager so the file handle is closed once parsing finishes;\n",
    "# the JSON is decoded as UTF-8 explicitly rather than via the platform default.\n",
    "with open(aug_data_path, 'r', encoding='utf-8') as aug_data_file:\n",
    "    aug_data = json.load(aug_data_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
      "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
      "(['the', 'Main', 'Building'], 'the Main Building')\n",
      "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
     ]
    }
   ],
   "source": [
    "def compare_answers():\n",
    "    \"\"\"Yield ``(answer_words, text)`` for every answer in ``aug_data``.\n",
    "\n",
    "    ``answer_words`` is the list of tokens selected by the answer's\n",
    "    ``answer_word_start`` / ``answer_word_stop`` indices and ``text`` is the\n",
    "    stored answer string, so the two can be compared side by side.\n",
    "\n",
    "    A sentence whose ``deps`` entry is ``None`` contributes an empty token\n",
    "    list, and an answer without word indices yields ``([], text)`` instead\n",
    "    of raising.\n",
    "    \"\"\"\n",
    "    for article in aug_data['data']:\n",
    "        for para in article['paragraphs']:\n",
    "            # One token list per sentence; node[0] is the token string.\n",
    "            wordss = []\n",
    "            for dep in para['deps']:\n",
    "                if dep is None:\n",
    "                    # Keep a placeholder so sentence indices still line up.\n",
    "                    wordss.append([])\n",
    "                else:\n",
    "                    # Unpack only after the None check; unpacking None raises TypeError.\n",
    "                    nodes, _edges = dep\n",
    "                    wordss.append([node[0] for node in nodes])\n",
    "            for qa in para['qas']:\n",
    "                for answer in qa['answers']:\n",
    "                    text = answer['text']\n",
    "                    word_start = answer['answer_word_start']\n",
    "                    word_stop = answer['answer_word_stop']\n",
    "                    if word_start is None or word_stop is None:\n",
    "                        # No token alignment recorded for this answer.\n",
    "                        yield [], text\n",
    "                        continue\n",
    "                    start_sent, start_word = word_start[0], word_start[1]\n",
    "                    stop_sent, stop_word = word_stop[0], word_stop[1]\n",
    "                    if start_sent == stop_sent:\n",
    "                        answer_words = wordss[start_sent][start_word:stop_word]\n",
    "                    else:\n",
    "                        # Answer crosses a sentence boundary: tail of the first\n",
    "                        # sentence, every sentence in between, head of the last.\n",
    "                        # Assumes stop_word is an exclusive index into the stop\n",
    "                        # sentence, as in the single-sentence case (TODO confirm).\n",
    "                        answer_words = list(wordss[start_sent][start_word:])\n",
    "                        for mid_words in wordss[start_sent + 1:stop_sent]:\n",
    "                            answer_words.extend(mid_words)\n",
    "                        answer_words.extend(wordss[stop_sent][:stop_word])\n",
    "                    yield answer_words, text\n",
    "\n",
    "# Peek at the first four (answer_words, text) pairs.\n",
    "ca = compare_answers()\n",
    "for sample_no in range(4):\n",
    "    print(next(ca))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "q: k\n",
      "q: j\n",
      "q: n\n",
      "q: b\n",
      "q: v\n",
      "x: .\n",
      "x: :208\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "q: dd\n",
      "q: dd\n",
      "q: dd\n",
      "q: dd\n",
      "q: d\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :411\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :40\n",
      "x: .\n",
      "x: *\n",
      "x: :14\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :131\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "53 10\n"
     ]
    }
   ],
   "source": [
    "def nodep_counter():\n",
    "    \"\"\"Report sentences and questions in ``aug_data`` that have no parse.\n",
    "\n",
    "    Prints an ``x:`` line for each sentence whose ``deps`` entry is ``None``\n",
    "    and a ``q:`` line for each question whose ``dep`` is ``None``, followed\n",
    "    by one final line holding the two totals.\n",
    "    \"\"\"\n",
    "    missing_sents = 0\n",
    "    missing_questions = 0\n",
    "    for article in aug_data['data']:\n",
    "        for para in article['paragraphs']:\n",
    "            sent_deps = para['deps']\n",
    "            for sent, sent_dep in zip(para['sents'], sent_deps):\n",
    "                if sent_dep is not None:\n",
    "                    continue\n",
    "                print(\"x:\", sent)\n",
    "                missing_sents += 1\n",
    "            for qa in para['qas']:\n",
    "                if qa['dep'] is not None:\n",
    "                    continue\n",
    "                print(\"q:\", qa['question'])\n",
    "                missing_questions += 1\n",
    "    print(missing_sents, missing_questions)\n",
    "nodep_counter()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "def bad_node_counter(expected_len=5):\n",
    "    \"\"\"Print how many nodes in ``aug_data`` do not have ``expected_len`` items.\n",
    "\n",
    "    Walks every non-``None`` entry of each paragraph's ``deps`` list, unpacks\n",
    "    it into ``(nodes, edges)`` and counts the nodes whose length differs from\n",
    "    ``expected_len``.\n",
    "\n",
    "    Args:\n",
    "        expected_len: number of items a well-formed node must have\n",
    "            (defaults to 5, the value this check originally hard-coded).\n",
    "    \"\"\"\n",
    "    count = 0\n",
    "    for article in aug_data['data']:\n",
    "        for para in article['paragraphs']:\n",
    "            for dep in para['deps']:\n",
    "                if dep is None:\n",
    "                    # Entries without a parse have no nodes to inspect.\n",
    "                    continue\n",
    "                nodes, _edges = dep\n",
    "                count += sum(1 for node in nodes if len(node) != expected_len)\n",
    "    print(count)\n",
    "bad_node_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36\n"
     ]
    }
   ],
   "source": [
    "def noanswer_counter():\n",
    "    \"\"\"Print how many answers in ``aug_data`` have no ``answer_word_start``.\n",
    "\n",
    "    Only the answers are inspected; the per-sentence token lists are not\n",
    "    needed for this count, so they are not built.\n",
    "    \"\"\"\n",
    "    count = 0\n",
    "    for article in aug_data['data']:\n",
    "        for para in article['paragraphs']:\n",
    "            for qa in para['qas']:\n",
    "                for answer in qa['answers']:\n",
    "                    if answer['answer_word_start'] is None:\n",
    "                        count += 1\n",
    "    print(count)\n",
    "noanswer_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "106\n"
     ]
    }
   ],
   "source": [
    "def mult_sent_answer_counter():\n",
    "    \"\"\"Print how many answers in ``aug_data`` start and stop in different sentences.\n",
    "\n",
    "    An answer is counted when element 0 of its ``answer_word_start`` differs\n",
    "    from element 0 of its ``answer_word_stop``. Answers missing either index\n",
    "    are skipped.\n",
    "    \"\"\"\n",
    "    count = 0\n",
    "    for article in aug_data['data']:\n",
    "        for para in article['paragraphs']:\n",
    "            for qa in para['qas']:\n",
    "                for answer in qa['answers']:\n",
    "                    word_start = answer['answer_word_start']\n",
    "                    word_stop = answer['answer_word_stop']\n",
    "                    if word_start is None or word_stop is None:\n",
    "                        # Nothing to compare; indexing None would raise TypeError.\n",
    "                        continue\n",
    "                    if word_start[0] != word_stop[0]:\n",
    "                        count += 1\n",
    "    print(count)\n",
    "mult_sent_answer_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
